Analysis question:

how does loop length correlate with SNR?

library(dplyr)
library(purrr)
library(readr)
library(stringr)
library(ggplot2)
library(viridis)
calc_gc_content <- function(sq) {
  num_g <- str_count(sq, "G")
  num_c <- str_count(sq, "C")
  gc_content <- (num_g + num_c) / str_length(sq) 
  #print(gc_content)
  return(gc_content)
 }
file<-'imotif_array_v2.median_SNR.with.hnRNPA1.avg_reps.genes.loops.tab'
data<-read.csv(file, sep='\t')
prctcutoff<-10

data$length_loop1<-nchar(data$loop1)
data$length_loop2<-nchar(data$loop2)
data$length_loop3<-nchar(data$loop3)

for (i in 1:nrow(data)){
  data$gc_loop1[i] <- calc_gc_content(data$loop1[i])
  data$gc_loop2[i] <- calc_gc_content(data$loop2[i])
  data$gc_loop3[i] <- calc_gc_content(data$loop3[i])
}

hnRNPK

top 10% of hnRNPK binders fall into only three sequence classes:

experiments<-c("hnRNPK_rep", "hnRNPLL_rep", "ASF_rep")
x<-experiments[1]
inds<- which(data[,x] > quantile(data[,x],prob=1-prctcutoff/100))
tmp<-data
tmp$cutoff<-"NA"
tmp$cutoff[inds]<-"top10"

print("Sequence classses in top 10prctile of HNRPK binding")
## [1] "Sequence classses in top 10prctile of HNRPK binding"
table(data$SeqClass[inds])
## 
##        3_Cs        4_Cs     5_10_Cs pos_control 
##          27         230         923           2

Boxplots

loop lengths

library(ComplexHeatmap)
library(circlize)

groups <-c('2_Cs','3_Cs','4_Cs','5_10_Cs')


for(group in groups){
  tmp2<-tmp[which(tmp$SeqClass==group),]
  
  # bin lengths
  tmp2$length_loop1[tmp2$length_loop1>9] <- paste0("\u2265","10")
  tmp2$length_loop2[tmp2$length_loop2>9] <- paste0("\u2265","10")
  tmp2$length_loop3[tmp2$length_loop3>9] <- paste0("\u2265","10")
  
  tmp2$length_loop1<-factor(tmp2$length_loop1,levels=c(seq(1,9), paste0("\u2265","10")))
  tmp2$length_loop2<-factor(tmp2$length_loop2,levels=c(seq(1,9), paste0("\u2265","10")))
  tmp2$length_loop3<-factor(tmp2$length_loop3,levels=c(seq(1,9), paste0("\u2265","10")))
  mat1<-tmp2[,c("length_loop1","length_loop2","length_loop3","iMab100nM_6.5_40","hnRNPK_rep", "hnRNPLL_rep", "ASF_rep","Mx2nM_6.5_40")]
  
  # rename experiments
  names(mat1)[4:ncol(mat1)] <- c('iMab','hnRNPK','hnRNPLL','ASF/SF1','Mitoxantrone')
  
  plot_data<-tidyr::pivot_longer(mat1,cols=c(4:ncol(mat1)),
                                 names_to =c("experiment"),
                                 values_to="SNR")
  
  plot_data<-tidyr::pivot_longer(plot_data,cols=c(1:3),
                                 names_to =c("loop_number"),
                                 values_to="loop_length")
  
  
  p<-ggplot(plot_data, aes(x=loop_length,y=SNR,group=loop_length,color=loop_length)) + 
    geom_boxplot() +
    theme_bw() +
    facet_grid(cols=vars(loop_number),rows=vars(experiment),scale='free_y') + 
    ggtitle(group) +
  #  scale_color_viridis(option="magma",  discrete = T)
    scale_color_viridis( discrete = T) 


  print(p)
}

loop gc content

for(group in groups){
  tmp2<-tmp[which(tmp$SeqClass==group),]
  
  # round gc content
  tmp2$gc_loop1 <- round(tmp2$gc_loop1,1)
  tmp2$gc_loop2 <-round(tmp2$gc_loop2,1)
  tmp2$gc_loop3<-round(tmp2$gc_loop3,1)
  tmp2$gc_loop1<-factor(tmp2$gc_loop1,levels=c(seq(0,1,0.1)))
  tmp2$gc_loop2<-factor(tmp2$gc_loop2,levels=c(seq(0,1,0.1)))
  tmp2$gc_loop3<-factor(tmp2$gc_loop3,levels=c(seq(0,1,0.1)))
  mat1<-tmp2[,c("gc_loop1","gc_loop2","gc_loop3","iMab100nM_6.5_40","hnRNPK_rep", "hnRNPLL_rep", "ASF_rep","Mx2nM_6.5_40")]
  
  plot_data<-tidyr::pivot_longer(mat1,cols=c(4:ncol(mat1)),
                                 names_to =c("experiment"),
                                 values_to="SNR")
  
  plot_data<-tidyr::pivot_longer(plot_data,cols=c(1:3),
                                 names_to =c("loop_number"),
                                 values_to="gc")
  
  
  p<-ggplot(plot_data, aes(x=gc,y=SNR,group=gc,color=gc)) + 
    geom_boxplot() +
    theme_bw() +
    facet_grid(cols=vars(loop_number),rows=vars(experiment),scale='free_y') + 
    ggtitle(group)+
        scale_color_viridis( discrete = T)

  print(p)
}

Density

loop lengths

groups <-c('2_Cs','3_Cs','4_Cs','5_10_Cs')


for(group in groups){
 
  plot_data<-tidyr::pivot_longer(mat1,cols=c(4:ncol(mat1)),
                                 names_to =c("experiment"),
                                 values_to="SNR")
  
  plot_data<-tidyr::pivot_longer(plot_data,cols=c(1:3),
                                 names_to =c("loop_number"),
                                 values_to="loop_length")
  
  
  p<-ggplot(plot_data, aes(x=SNR,group=loop_length,color=loop_length)) + 
    geom_density()+
    theme_bw() +
    facet_grid(cols=vars(loop_number),rows=vars(experiment),scale='free') + 
    ggtitle(group) +
  #  scale_color_viridis(option="magma",  discrete = T)
    scale_color_viridis( discrete = T)

  print(p)
}

loop gc content

for(group in groups){

  plot_data<-tidyr::pivot_longer(mat1,cols=c(4:ncol(mat1)),
                                 names_to =c("experiment"),
                                 values_to="SNR")
  
  plot_data<-tidyr::pivot_longer(plot_data,cols=c(1:3),
                                 names_to =c("loop_number"),
                                 values_to="gc")
  
  p<-ggplot(plot_data, aes(x=SNR,group=gc,color=gc)) + 
    geom_density() +
    theme_bw() +
    facet_grid(cols=vars(loop_number),rows=vars(experiment),scale='free_y') + 
    ggtitle(group)+
        scale_color_viridis( discrete = T)

  print(p)
}

# mat1<-tmp2[,c(x,"length_loop1","length_loop2","length_loop3")]
# mat2<-tmp2[,c(x,"gc_loop1","gc_loop2","gc_loop3")]
# col_fun= colorRamp2(c(1,10),c("yellow","blue"))
# col_fun_gc = colorRamp2(c(0,0.5,1),c("#edf8b1","#7fcdbb","#2c7fb8"))
# mat1<-mat1[order(mat1[,1],decreasing=T),]
# mat2<-mat2[order(mat2[,1],decreasing=T),]
# ComplexHeatmap::Heatmap(mat1[,2:4],
#                         col=col_fun,
#                         #row_names_side = "left", 
#                         cluster_rows=F,
#                         cluster_columns=F,
#                         show_row_names = F,
#                         column_title="3_Cs"
# 
# )